library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.5
library(stringr)
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.0.5
library(gsubfn)
## Warning: package 'gsubfn' was built under R version 4.0.5
## Loading required package: proto
## Warning: package 'proto' was built under R version 4.0.5
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.0.5

Read in the data set

injuries <- read.csv("C:/Users/wduff/OneDrive/School/Harvard/Fall2021/BST260/BST_260_project/Data/injuries_only.csv")
head(injuries)
dim(injuries)
## [1] 17362    27
injuries <- injuries %>% mutate(injury_types = tolower(injury_types))
# Replace any "/" with " " ?
injuries$injury_types <- gsub("/", " ", injuries$injury_types)

Body part vectors

Head <- c("concussion", "head", "neck", "eye", "ear", "chin", "migranes", "migrane", "jaw", "nose", "tooth", "stinger", "faciallaceration", "eyelid", "facial", "mouth", "nose", "throat", "seizure")

Shoulder <- c("shoulder", "clavicle", "collarbone", "leftshoulder", "rightshoulder", "jointshoulder", "scapula", "sternoclavicular")

UpperTorso <- c("rib", "back", "pectoral", "pec", "chest", "oblique", "lumbar", "core", "spine", "trapezius", "kidney", "lung", "heart", "abdomen", "abdominal", "solarplexus", "spleen", "hernia", "arrhythmia", "liver", "stomach", "appendix")

LowerTorso <- c("glute", "buttocks", "hip", "righthip", "pelvis", "tailbone")


Arm <- c("arm", "rightupperarm", "forearm", "bicep", "biceps", "elbow", "rightelbow", "tricep", "triceps")

Hand <- c("hand", "righthand", "thumb", "rightthumb", "finger", "rightfinger", "wrist", "rightwrist")

Leg <- c("hamstring", "righthamstring", "knee", "rightknee", "acl", "mcl", "meniscus", "bothknees", "thigh", "rightthigh", "calf", "rightcalf", "quadricep", "rightquadricep", "quad", "groin", "rightgroin", "lowerleg", "tibia", "fibula", "shin", "rightshin", "adductor", "contusion")

Foot <- c("foot", "rightfoot", "ankle", "achilles", "toe", "toes", "heel")

Function to count broad body part injuries

# Bodypart input must be a string
broad <- function(bodylist, bodypart) {
  
  pasted <- paste(bodylist, collapse = "|")
  broad_injury <- gsub(pasted, bodypart, injuries$injury_types)
  str_count(broad_injury, pattern = bodypart)

  }

Calculating broad injury counts

# Head
head_counts <- broad(Head, "head")

# Shoulder
shoulder_counts <- broad(Shoulder, "shoulder")

# Upper Torso
uppertorso_counts <- broad(UpperTorso, "uppertorso")

# Lower Torso
lowertorso_counts <- broad(LowerTorso, "lowertorso")

# Arm
arm_counts <- broad(Arm, "arm")

# Hand
hand_counts <- broad(Hand, "hand")

# Leg
leg_counts <- broad(Leg, "leg")

# Foot
foot_counts <- broad(Foot, "foot")
injuries

Adding body part columns to the orginal injury data frame

injuries <- injuries %>% 
  mutate(head = head_counts) %>%
  mutate(shoulder = shoulder_counts) %>%
  mutate(upper_torso = uppertorso_counts) %>%
  mutate(lower_torso = lowertorso_counts) %>%
  mutate(arm = arm_counts) %>%
  mutate(hand = hand_counts) %>%
  mutate(leg = leg_counts) %>%
  mutate(foot = foot_counts)
  

injuries

Save the cleaned data frame

write.csv(injuries,"C:/Users/wduff/OneDrive/School/Harvard/Fall2021/BST260/BST_260_project/cleaneddata.csv", row.names = TRUE)

Gathering into long format

injury_gather <- gather(injuries, key="bodypart", value="counts", 28:ncol(injuries))
injury_gather

Overall distribution of body part injuries

injury_gather %>% 
  group_by(bodypart) %>%
  summarise(counts = sum(counts)) %>%
  ggplot(aes(x = reorder(bodypart, -counts), y = counts)) +
  geom_col() +
  xlab("Body Part") +
  ylab("Count") +
  ggtitle("Distriubtion of Injuries by Body Part")

Now for the next part at EDA, let’s look at the injury distributions across the various positions in football

levels(as.factor(injuries$position_id))
## [1] ""    "DEF" "K"   "OL"  "P"   "QB"  "RB"  "TE"  "WR"

We see we have the following positions: Kicker (K), Offensive Line (OL), Punter (P), Quarter Back (QB), Running Back (RB), Tight End (TE), Wide Reciever (WR) and Defense (DEF).

Since Defense has it’s own category with no specific position (like Linebacker, Defensive Line or Safety), let’s first compare the injury distributions between Offensive Players and Defensive Players

offensive_position <- c("K", "OL", "P", "QB", "RB", "TE", "WR")

offense <- injury_gather %>% filter(position_id %in% offensive_position)
defense <- injury_gather %>% filter(position_id == "DEF") 

levels(as.factor(offense$position_id))
## [1] "K"  "OL" "P"  "QB" "RB" "TE" "WR"
levels(as.factor(defense$position_id))
## [1] "DEF"
dim(offense)
## [1] 61696    29
dim(defense)
## [1] 58472    29

We have 7712 offensive players and 72309 defensive players which is great since the data sets are somewhat balanced and therefore comparing them will be valid.

Overall distribution of offensive injuries

offense %>% 
  group_by(bodypart) %>%
  summarise(counts = sum(counts)) %>%
  ggplot(aes(x = reorder(bodypart, -counts), y = counts)) +
  geom_col() +
  xlab("Body Part") +
  ylab("Count") +
  ggtitle("Distriubtion of Offensive Injuries")

Overall distribution of defensive injuries

defense %>% 
  group_by(bodypart) %>%
  summarise(counts = sum(counts)) %>%
  ggplot(aes(x = reorder(bodypart, -counts), y = counts)) +
  geom_col() +
  xlab("Body Part") +
  ylab("Count") +
  ggtitle("Distriubtion of Defensive Injuries")

RSHINY: Dropbox (all injuries, offensive injuries, defensive injuries) Slider (age) ?? min, max

injury_gather %>% 
  group_by(team) %>%
  summarise(counts = sum(counts)) %>%
  ggplot(aes(x = reorder(team, -counts), y = counts)) +
  geom_col() +
  xlab("NFL Team") +
  ylab("Count") +
  ggtitle("Distriubtion of Injuries by Team")

injury_gather %>% 
  group_by(year) %>%
  summarise(counts = sum(counts)) %>%
  ggplot(aes(x = year, y = counts)) +
  geom_col() +
  xlab("Year") +
  ylab("Count") +
  ggtitle("Distriubtion of Injuries by Year")

injury_gather %>% 
  group_by(height_inches) %>%
  summarise(counts = sum(counts)) %>%
  ggplot(aes(x = height_inches, y = counts)) +
  geom_col() +
  xlab("Height (in)") +
  ylab("Count") +
  ggtitle("Distriubtion of Injuries by Height")
## Warning: Removed 1 rows containing missing values (position_stack).

injury_gather %>% 
  group_by(weight_pounds) %>%
  summarise(counts = sum(counts)) %>%
  ggplot(aes(x = weight_pounds, y = counts)) +
  geom_col() +
  xlab("Weight (lbs)") +
  ylab("Count") +
  ggtitle("Distriubtion of Injuries by Weight")
## Warning: Removed 1 rows containing missing values (position_stack).

injury_age <- injury_gather %>% mutate(age = floor(as.numeric(difftime(Sys.Date(),injury_gather$birthdate, units = "weeks"))/52.25))

injury_age
injury_age %>% 
  group_by(age) %>%
  summarise(counts = sum(counts)) %>%
  ggplot(aes(x = age, y = counts)) +
  geom_col() +
  xlab("Age") +
  ylab("Count") +
  ggtitle("Distriubtion of Injuries by Age")
## Warning: Removed 1 rows containing missing values (position_stack).

injury_age %>% 
  group_by(bodypart) %>%
  filter(age == 37) %>%
  summarise(counts = sum(counts)) %>%
  ggplot(aes(x = reorder(bodypart, - counts), y = counts)) +
  geom_col() +
  xlab("Age = 37 Years") +
  ylab("Count") +
  ggtitle("Distriubtion of Injuries 37 Year Old NFL players")